In [1]:
import warnings
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")
In [2]:
# Importem llibreries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
In [3]:
# Relative path of the gym-members exercise-tracking CSV
path_scatter = "data/gym_members_exercise_tracking.csv"

# Load the CSV into a DataFrame
df_scatter = pd.read_csv(filepath_or_buffer=path_scatter)

# Initial inspection: summary statistics of the numeric columns
df_scatter.describe()
Out[3]:
| Age | Weight (kg) | Height (m) | Max_BPM | Avg_BPM | Resting_BPM | Session_Duration (hours) | Calories_Burned | Fat_Percentage | Water_Intake (liters) | Workout_Frequency (days/week) | Experience_Level | BMI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 973.000000 | 973.000000 | 973.00000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 |
| mean | 38.683453 | 73.854676 | 1.72258 | 179.883864 | 143.766701 | 62.223022 | 1.256423 | 905.422405 | 24.976773 | 2.626619 | 3.321686 | 1.809866 | 24.912127 |
| std | 12.180928 | 21.207500 | 0.12772 | 11.525686 | 14.345101 | 7.327060 | 0.343033 | 272.641516 | 6.259419 | 0.600172 | 0.913047 | 0.739693 | 6.660879 |
| min | 18.000000 | 40.000000 | 1.50000 | 160.000000 | 120.000000 | 50.000000 | 0.500000 | 303.000000 | 10.000000 | 1.500000 | 2.000000 | 1.000000 | 12.320000 |
| 25% | 28.000000 | 58.100000 | 1.62000 | 170.000000 | 131.000000 | 56.000000 | 1.040000 | 720.000000 | 21.300000 | 2.200000 | 3.000000 | 1.000000 | 20.110000 |
| 50% | 40.000000 | 70.000000 | 1.71000 | 180.000000 | 143.000000 | 62.000000 | 1.260000 | 893.000000 | 26.200000 | 2.600000 | 3.000000 | 2.000000 | 24.160000 |
| 75% | 49.000000 | 86.000000 | 1.80000 | 190.000000 | 156.000000 | 68.000000 | 1.460000 | 1076.000000 | 29.300000 | 3.100000 | 4.000000 | 2.000000 | 28.560000 |
| max | 59.000000 | 129.900000 | 2.00000 | 199.000000 | 169.000000 | 74.000000 | 2.000000 | 1783.000000 | 35.000000 | 3.700000 | 5.000000 | 3.000000 | 49.840000 |
In [4]:
# Show the first five rows of the dataset
df_scatter.head(n=5)
Out[4]:
| Age | Gender | Weight (kg) | Height (m) | Max_BPM | Avg_BPM | Resting_BPM | Session_Duration (hours) | Calories_Burned | Workout_Type | Fat_Percentage | Water_Intake (liters) | Workout_Frequency (days/week) | Experience_Level | BMI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | Male | 88.3 | 1.71 | 180 | 157 | 60 | 1.69 | 1313.0 | Yoga | 12.6 | 3.5 | 4 | 3 | 30.20 |
| 1 | 46 | Female | 74.9 | 1.53 | 179 | 151 | 66 | 1.30 | 883.0 | HIIT | 33.9 | 2.1 | 4 | 2 | 32.00 |
| 2 | 32 | Female | 68.1 | 1.66 | 167 | 122 | 54 | 1.11 | 677.0 | Cardio | 33.4 | 2.3 | 4 | 2 | 24.71 |
| 3 | 25 | Male | 53.2 | 1.70 | 190 | 164 | 56 | 0.59 | 532.0 | Strength | 28.8 | 2.1 | 3 | 1 | 18.41 |
| 4 | 38 | Male | 46.1 | 1.79 | 188 | 158 | 68 | 0.64 | 556.0 | Strength | 29.2 | 2.8 | 3 | 1 | 14.39 |
In [5]:
# Scatter plot of calories burned against session duration, coloured by gender.
# The figure and axes are created explicitly and the axes is handed to seaborn,
# so titles and labels are set on a known axes object.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=df_scatter,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Gender',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Calories cremades per Duració de l\'entrenament')
ax.set_xlabel('Duració de l\'entrenament en Hores')
ax.set_ylabel('Calories cremades')
plt.show()
In [6]:
# Relative path of the Udemy online-courses CSV
path_sunburst = "data/udemy_online_education_courses_dataset.csv"

# Load the CSV into a DataFrame
df_sunburst = pd.read_csv(filepath_or_buffer=path_sunburst)

# Initial inspection: summary statistics of the numeric columns
df_sunburst.describe()
Out[6]:
| course_id | price | num_subscribers | num_reviews | num_lectures | content_duration | |
|---|---|---|---|---|---|---|
| count | 3.678000e+03 | 3678.000000 | 3678.000000 | 3678.000000 | 3678.000000 | 3678.000000 |
| mean | 6.759720e+05 | 66.049483 | 3197.150625 | 156.259108 | 40.108755 | 4.094517 |
| std | 3.432732e+05 | 61.005755 | 9504.117010 | 935.452044 | 50.383346 | 6.053840 |
| min | 8.324000e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4.076925e+05 | 20.000000 | 111.000000 | 4.000000 | 15.000000 | 1.000000 |
| 50% | 6.879170e+05 | 45.000000 | 911.500000 | 18.000000 | 25.000000 | 2.000000 |
| 75% | 9.613555e+05 | 95.000000 | 2546.000000 | 67.000000 | 45.750000 | 4.500000 |
| max | 1.282064e+06 | 200.000000 | 268923.000000 | 27445.000000 | 779.000000 | 78.500000 |
In [7]:
# Show the first five rows of the dataset
df_sunburst.head(n=5)
Out[7]:
| course_id | course_title | url | is_paid | price | num_subscribers | num_reviews | num_lectures | level | content_duration | published_timestamp | subject | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1070968 | Ultimate Investment Banking Course | https://www.udemy.com/ultimate-investment-bank... | True | 200 | 2147 | 23 | 51 | All Levels | 1.5 | 2017-01-18T20:58:58Z | Business Finance |
| 1 | 1113822 | Complete GST Course & Certification - Grow You... | https://www.udemy.com/goods-and-services-tax/ | True | 75 | 2792 | 923 | 274 | All Levels | 39.0 | 2017-03-09T16:34:20Z | Business Finance |
| 2 | 1006314 | Financial Modeling for Business Analysts and C... | https://www.udemy.com/financial-modeling-for-b... | True | 45 | 2174 | 74 | 51 | Intermediate Level | 2.5 | 2016-12-19T19:26:30Z | Business Finance |
| 3 | 1210588 | Beginner to Pro - Financial Analysis in Excel ... | https://www.udemy.com/complete-excel-finance-c... | True | 95 | 2451 | 11 | 36 | All Levels | 3.0 | 2017-05-30T20:07:24Z | Business Finance |
| 4 | 1011058 | How To Maximize Your Profits Trading Options | https://www.udemy.com/how-to-maximize-your-pro... | True | 200 | 1276 | 45 | 26 | Intermediate Level | 2.0 | 2016-12-13T14:57:18Z | Business Finance |
In [8]:
# Add a constant column with the platform name; it becomes the root of the sunburst
df_sunburst['Platform'] = "Udemy"

# Hierarchy used both for the aggregation and for the rings of the chart
sunburst_levels = ['Platform', 'subject', 'level']

# Total number of subscribers for every platform / subject / level combination
df_grouped = (
    df_sunburst
    .groupby(sunburst_levels)['num_subscribers']
    .sum()
    .reset_index(name='total_subscribers')
)

# Build the sunburst chart and fix its size to 800 x 800
fig = px.sunburst(
    df_grouped,
    path=sunburst_levels,
    values='total_subscribers',
    title='Nombre de Subscriptors a Udemy Online per Subjecte i Nivell',
).update_layout(width=800, height=800)

# Display the chart
fig.show()
In [9]:
# Relative path of the Norway meteorological CSV
path_ridgeline = "data/NorwayMeteoDataCompleted.csv"

# Load the CSV into a DataFrame
df_ridgeline = pd.read_csv(filepath_or_buffer=path_ridgeline)

# Initial inspection: summary statistics of the numeric columns
df_ridgeline.describe()
Out[9]:
| Unnamed: 0 | latitude | longtitude | max(air_temperature P1D) | max(relative_humidity P1D) | max(wind_speed P1D) | mean(air_temperature P1D) | mean(relative_humidity P1D) | mean(wind_speed P1D) | sum(precipitation_amount P1D) | day | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 237629.000000 | 237629.000000 | 237629.000000 | 44766.000000 | 36965.000000 | 35260.000000 | 63551.000000 | 37411.000000 | 35260.000000 | 107937.000000 | 237629.000000 | 237629.000000 | 237629.000000 |
| mean | 118814.000000 | 60.120401 | 13.299210 | 7.296911 | 89.019965 | 7.343434 | 4.160783 | 75.800406 | 4.169682 | 2.405301 | 15.728215 | 6.521414 | 2015.562053 |
| std | 68597.727896 | 6.994602 | 8.083162 | 9.082600 | 9.875235 | 4.809745 | 8.169349 | 13.286090 | 3.328436 | 7.006714 | 8.797737 | 3.448545 | 3.433181 |
| min | 0.000000 | 36.666111 | -4.482222 | -25.700000 | 31.000000 | 0.000000 | -27.950000 | 20.000000 | 0.000000 | -1.000000 | 1.000000 | 1.000000 | 2010.000000 |
| 25% | 59407.000000 | 58.990000 | 8.094700 | 0.700000 | 85.000000 | 3.900000 | -1.200000 | 67.000000 | 1.800000 | 0.000000 | 8.000000 | 4.000000 | 2013.000000 |
| 50% | 118814.000000 | 60.863200 | 10.976200 | 7.000000 | 92.000000 | 6.100000 | 4.200000 | 77.000000 | 3.200000 | 0.100000 | 16.000000 | 7.000000 | 2016.000000 |
| 75% | 178221.000000 | 63.554000 | 19.197778 | 14.037500 | 96.000000 | 9.700000 | 10.300000 | 86.000000 | 5.500000 | 2.100000 | 23.000000 | 10.000000 | 2019.000000 |
| max | 237628.000000 | 70.335700 | 35.106944 | 36.200000 | 107.000000 | 35.500000 | 42.600000 | 100.000000 | 29.900000 | 928.000000 | 31.000000 | 12.000000 | 2021.000000 |
In [10]:
# Show the first five rows of the dataset
df_ridgeline.head(n=5)
Out[10]:
| Unnamed: 0 | sourceId | latitude | longtitude | max(air_temperature P1D) | max(relative_humidity P1D) | max(wind_speed P1D) | mean(air_temperature P1D) | mean(relative_humidity P1D) | mean(wind_speed P1D) | sum(precipitation_amount P1D) | day | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | SN100 | 61.134900 | 12.503900 | NaN | NaN | NaN | NaN | NaN | NaN | 0.4 | 1 | 1 | 2010 |
| 1 | 1 | SN1135 | 58.990000 | 11.540800 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 1 | 2010 |
| 2 | 2 | SN1151800 | 50.100278 | 14.255556 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 1 | 2010 |
| 3 | 3 | SN15262 | 61.674000 | 8.368500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 1 | 2010 |
| 4 | 4 | SN1531000 | 45.473056 | 28.032222 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 1 | 2010 |
In [11]:
# Month number (1-12) -> Catalan month name, used to label the plots.
# The names are listed in calendar order and numbered starting at 1.
month_mapping = dict(enumerate(
    [
        "Gener", "Febrer", "Març", "Abril",
        "Maig", "Juny", "Juliol", "Agost",
        "Setembre", "Octubre", "Novembre", "Desembre",
    ],
    start=1,
))
In [12]:
# Replace the month numbers with their Catalan names.
# The guard keeps this cell safe to re-run: the column is overwritten in place, so
# after the first run it holds names instead of numbers. Mapping it a second time
# would turn every value into NaN, because the names are not keys of month_mapping.
if pd.api.types.is_numeric_dtype(df_ridgeline['month']):
    df_ridgeline['month'] = df_ridgeline['month'].map(month_mapping)
In [13]:
# Mean of the daily mean air temperature for each month (Series indexed by month)
month_mean_serie = (
    df_ridgeline
    .groupby('month')['mean(air_temperature P1D)']
    .agg('mean')
)

# Attach each row's monthly mean as 'mean_month'; the ridgeline plot uses it as hue
df_ridgeline['mean_month'] = df_ridgeline['month'].map(month_mean_serie)
In [14]:
# Column with the daily mean air temperature; every layer of the plot uses it
temp_col = 'mean(air_temperature P1D)'

# Colour palette: 12 shades of "coolwarm", one per month
pal = sns.color_palette("coolwarm", n_colors=12)

# Ridgeline plot: one facet row per month, coloured through the 'mean_month' hue
g = sns.FacetGrid(df_ridgeline, row='month', hue='mean_month',
                  aspect=15, height=0.75, palette=pal)

# Filled density curve of the temperature for each month
g.map(sns.kdeplot, temp_col,
      bw_adjust=1, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)
# White outline drawn over the filled curve
g.map(sns.kdeplot, temp_col,
      bw_adjust=1, clip_on=False,
      color="w", lw=2)
# Horizontal baseline at y=0 on every facet
g.map(plt.axhline, y=0, lw=2, clip_on=False)

# Write the month name at the left of each facet.
# The x position is the same for every facet, so it is computed once instead of
# rescanning the whole column on each iteration.
label_x = df_ridgeline[temp_col].min() - 5
for ax in g.axes.flat:
    # The facet title has the form "<row variable> = <value>"; keep only the value
    month_name = ax.get_title().split('=')[1].strip()
    ax.text(label_x, 0.02, month_name,
            fontweight='bold', fontsize=12,
            # Reuse the colour of the last line drawn on this facet (the baseline above)
            color=ax.lines[-1].get_color())

# Style adjustments: drop the facet titles, the y ticks, the y label and the spines
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

# X-axis label, set through the grid rather than through pyplot's current axes
g.set_xlabels('Temperatura Mitjana de l\'Aire (°C)', fontweight='bold', fontsize=12)

# Centred overall title (`figure` is the preferred accessor; `fig` is a deprecated alias)
g.figure.suptitle('Distribució de la Temperatura Mitjana de l\'Aire per Mes',
                  ha='center', fontsize=16, fontweight='bold')
plt.show()
In [ ]: